In [8]:
import pandas as pd
from autoc.explorer import cserie,DataExploration
from autoc.utils.helpers import *
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.style.use('ggplot')
import seaborn as sns
plt.rcParams['figure.figsize'] = (12.0, 8)
np.random.seed(0)
In [9]:
# Load the "Give Me Some Credit" Kaggle dataset (cs-training.csv)
# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR
path = '/Users/ericfourrier/Documents/Data/Give_Me_Some_Credit/cs-training.csv'
df_train = pd.read_csv(path)
In [10]:
# if you prefer to work with a database
# from sqlalchemy import create_engine
# engine = create_engine('sqlite://')
# df_train.to_sql('cstraining',engine)
# engine.table_names()
# test = pd.read_sql("select * from cstraining",engine)
In [11]:
# Quick look at the first rows of the raw data
df_train.head(10)
Out[11]:
In [12]:
# Column dtypes and non-null counts
df_train.info()
In [13]:
# Mean of each numeric column per number of dependents
df_train.groupby('NumberOfDependents').mean()
Out[13]:
In [14]:
# Wrap the frame in autoc's DataExploration helper
exploration = DataExploration(df_train)
In [15]:
# Per-column structure overview (see autoc.explorer)
exploration.structure()
Out[15]:
In [16]:
# Near-zero-variance column detection
exploration.nearzerovar()
Out[16]:
In [17]:
# Correlated column detection
exploration.findcorr()
Out[17]:
In [18]:
# Work on a copy so df_train stays untouched
df_train_categoric = df_train.copy()
In [19]:
# Discretize the truly continuous columns into decile bins; the interval
# labels are then stringified so every column becomes categorical-like.
numeric_cols = ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio', 'MonthlyIncome']
nb_quantiles = 10
for numeric_col in numeric_cols:
    binned = pd.qcut(df_train_categoric.loc[:, numeric_col], nb_quantiles)
    df_train_categoric.loc[:, numeric_col] = binned.astype('str')
In [20]:
# Transform every variable to pandas' Categorical type
# fix problem with category variable
# df_train_categoric = df_train_categoric.apply(lambda x: x.astype('str'),axis = 0)
In [21]:
# dtypes after binning: the binned columns are now object (string) dtype
df_train_categoric.dtypes
Out[21]:
In [22]:
# Descriptive statistics of the transformed frame
df_train_categoric.describe()
Out[22]:
In [23]:
# Copy on which missing values will be simulated
df_simu = df_train_categoric.copy()
In [24]:
# Explorer for the simulated frame (reused further down)
ec = DataExploration(df_simu)
In [25]:
def simulate_na_col(df, colname, n=None, pct=None, weights=None,
                    safety=True, *args, **kwargs):
    """Insert missing values (np.nan) into categorical column(s) of df, in place.

    Parameters
    ----------
    df : pd.DataFrame
        Frame mutated in place.
    colname : str, list or pd.Index
        Column name(s) to degrade; with several names the function recurses
        once per column.
    n : int, optional
        Number of values to replace by NaN.
    pct : float, optional
        Fraction of rows to replace; only used when n is None.
        Careful: computed on the full frame, including already-missing rows.
    weights : sequence, optional
        Probability mass over the column's categories, ordered like
        value_counts(sort=False); if None a random pmf is drawn (random_pmf).
    safety : bool
        When True, indices returned by keep_category(df, colname, ...) are
        protected from NaN insertion.
    """
    if n is None and pct is not None:
        n = int(pct * df.shape[0])
    # pd.core.index.Index is a private path removed in modern pandas; pd.Index
    # is the public, stable equivalent
    if isinstance(colname, (pd.Index, list)):
        for single_col in colname:
            # propagate safety/extra args (the old recursion silently dropped them)
            simulate_na_col(df, single_col, n, pct, weights, safety,
                            *args, **kwargs)
        return
    col = df.loc[:, colname]
    if safety:
        # we are not sampling from the protected indices
        tokeep = keep_category(df, colname, *args, **kwargs)
        col = col.drop(tokeep)
    col = col.dropna()
    print(colname)
    col_distribution = col.value_counts(normalize=True, sort=False)
    labels = col_distribution.index  # the observed categories
    # `weights if weights else ...` is ambiguous for arrays and wrongly rejects
    # falsy sequences; compare against None explicitly
    pmf_na = weights if weights is not None else random_pmf(len(labels))
    na_distribution = pd.Series(data=pmf_na, index=labels)
    # bias the draw of rows to blank out by the per-category pmf
    weights_na = col.apply(lambda x: na_distribution[x])
    weights_na /= weights_na.sum()
    index_to_replace = col.sample(n=n, weights=weights_na, replace=False).index
    df.loc[index_to_replace, colname] = np.nan
In [26]:
# Simulate 80 000 missing values in every column of df_simu (mutates in place)
simulate_na_col(df_simu,list(df_train_categoric.columns),n=80000)
In [27]:
# Missing-value count per column after the simulation
df_simu.isnull().sum(axis = 0)
Out[27]:
In [309]:
# NOTE(review): execution counts are non-sequential from here on — restart the
# kernel and run top to bottom before sharing
df_simu.hist()
Out[309]:
In [310]:
df_train_categoric.hist()
Out[310]:
In [308]:
df_train.dtypes
Out[308]:
In [221]:
# KL divergence between the degraded and the original target distribution
kl_series(df_simu.SeriousDlqin2yrs,df_train_categoric.SeriousDlqin2yrs)
Out[221]:
In [222]:
df_simu.SeriousDlqin2yrs.dropna().value_counts(normalize=True).values
Out[222]:
In [223]:
df_train_categoric.SeriousDlqin2yrs.value_counts(normalize=True).values
Out[223]:
In [224]:
# Same KL computed directly from the two normalized frequency vectors
kl(df_train_categoric.SeriousDlqin2yrs.value_counts(normalize=True).values,df_simu.SeriousDlqin2yrs.dropna().value_counts(normalize=True).values)
Out[224]:
In [311]:
# Per-column KL divergence between the degraded and the original frame;
# a column that cannot be compared just reports its error instead of
# aborting the whole loop.
for column_name in df_simu.columns:
    try:
        divergence = kl_series(df_simu.loc[:, column_name],
                               df_train_categoric.loc[:, column_name])
        print("Kullback-Leibler divergence between both distribution: {}".format(divergence))
    except Exception as e:
        print('error:{}'.format(e))
In [1]:
# NOTE(review): import placed mid-notebook — move it to the import cell at the top
from autoc import NaImputer, missing_map
In [31]:
# Visual map of the missing values (first 1000 rows)
missing_map(df_simu,nmax=1000) # no pattern visible
Out[31]:
In [28]:
na = NaImputer(df_simu)
In [32]:
na.corrplot_na() # totally missing at random
In [33]:
# Summary statistics about missingness
na.infos_na()
Out[33]:
In [226]:
ec.structure()
Out[226]:
Out[226]:
In [231]:
# Dirty cleaning: cap the heavy-tailed count columns at a small maximum
df_simu.loc[df_simu.NumberOfOpenCreditLinesAndLoans >=10,'NumberOfOpenCreditLinesAndLoans'] =10
df_simu.loc[df_simu.NumberRealEstateLoansOrLines >=5,'NumberRealEstateLoansOrLines'] = 5
# NOTE(review): this statement was duplicated (the second copy was a no-op and
# has been removed); check whether the duplicate was meant to cap the
# NumberOfTime30-59 / NumberOfTime60-89 past-due columns instead
df_simu.loc[df_simu.NumberOfTimes90DaysLate >=5,'NumberOfTimes90DaysLate'] = 5
In [243]:
# Bin age into deciles as well
# NOTE(review): attribute assignment (df.age = ...) works for an existing
# column, but df_simu['age'] = ... is the safer idiom
df_simu.age = pd.qcut(df_simu.age,10)
In [244]:
df_simu.age.value_counts()
Out[244]:
In [245]:
DataExploration(df_simu).structure()
Out[245]:
Out[245]:
In [271]:
# Impute age with a multinomial Naive Bayes trained on the one-hot encoded
# remaining columns (rows with known age = train, unknown age = to predict)
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
# NOTE(review): fillna('mean') fills with the literal string 'mean', i.e. a
# sentinel category after get_dummies — confirm this is intended and not an
# attempted mean imputation
# .as_matrix() was deprecated and removed in pandas 1.0; .values is the
# equivalent that works on both old and new pandas
test = pd.get_dummies(df_simu[df_simu.age.isnull()].drop('age',axis =1).fillna('mean')).values
X = pd.get_dummies(df_simu[df_simu.age.notnull()].drop('age',axis =1).fillna('mean')).values
y = df_simu.age[df_simu.age.notnull()].values
clf.fit(X, y)
Out[271]:
In [275]:
# Shape of the training design matrix
X.shape
Out[275]:
In [277]:
# train prediction
# NOTE(review): this X.shape is a no-op (its value is discarded); only a
# cell's last expression is displayed
X.shape
clf.predict(X)
Out[277]:
In [278]:
# Per-class probabilities on the training set
clf.predict_proba(X)
Out[278]:
In [279]:
# Training accuracy (evaluated on the same data the model was fit on)
clf.score(X, y, sample_weight=None)
Out[279]:
In [282]:
# Side-by-side (prediction, truth) pairs on the training set
res= np.array([clf.predict(X),y]).T
In [283]:
# NOTE(review): displays the full array — prefer res[:10] to avoid a wall of output
res
Out[283]:
In [273]:
# test prediction
test.shape
#clf.predict(test)
Out[273]:
In [251]:
y
Out[251]:
In [ ]:
# NOTE(review): result not assigned — this display-only cell does not modify df_simu
df_simu.drop('age',axis =1)
In [ ]:
df_simu.age.isnull()
In [182]:
df_simu.age
Out[182]: